package com.dlmap.readbook.utils;

import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.net.MalformedURLException;
import java.net.URL;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

/**
 * Small helpers for downloading a web page as text and extracting a
 * plain-text fragment from the downloaded HTML.
 */
public class WebUtils {

	/** Charset used by {@link #getHtml(String)} when none is given. */
	private static final String DEFAULT_CHARSET = "utf-8";

	/**
	 * Matches an HTML line break in any common spelling: {@code <br>},
	 * {@code <br/>}, {@code <br />}, in either letter case.
	 */
	private static final Pattern LINE_BREAK =
			Pattern.compile("<br\\s*/?>", Pattern.CASE_INSENSITIVE);

	/**
	 * Matches a single tag. A negated character class is used instead of
	 * {@code .*?} because it also matches line terminators, so tags whose
	 * attributes are wrapped over several lines are matched as well.
	 */
	private static final Pattern TAG = Pattern.compile("<[^>]*>");

	/**
	 * Downloads the resource at the given URL and decodes it as UTF-8.
	 *
	 * @param htmlurl absolute URL of the page to download
	 * @return the page content, every line terminated by {@code "\r\n"}
	 * @throws MalformedURLException if {@code htmlurl} is not a valid URL
	 * @throws IOException if the page cannot be opened or read
	 */
	public String getHtml(String htmlurl) throws IOException {
		return getHtml(htmlurl, DEFAULT_CHARSET);
	}

	/**
	 * Downloads the resource at the given URL and decodes it with the given
	 * charset.
	 *
	 * @param htmlurl absolute URL of the page to download
	 * @param charset name of the charset the page is encoded in, e.g. {@code "GBK"}
	 * @return the page content, every line terminated by {@code "\r\n"}
	 * @throws MalformedURLException if {@code htmlurl} is not a valid URL
	 * @throws java.io.UnsupportedEncodingException if {@code charset} is not supported
	 * @throws IOException if the page cannot be opened or read
	 */
	public String getHtml(String htmlurl, String charset) throws IOException {
		URL url;
		try {
			url = new URL(htmlurl);
		} catch (MalformedURLException me) {
			System.out.println("你输入的URL格式有问题！请仔细输入");
			throw me;
		}

		StringBuilder sb = new StringBuilder();
		java.io.InputStream raw = url.openStream();
		try {
			BufferedReader in = new BufferedReader(new InputStreamReader(raw, charset));
			String line;
			// Read the entire page, normalising every line ending to CRLF.
			while ((line = in.readLine()) != null) {
				sb.append(line).append("\r\n");
			}
		} finally {
			// Closing the underlying stream releases the connection even when
			// decoding or reading fails part-way through.
			raw.close();
		}
		return sb.toString();
	}

	/**
	 * Returns the text of the first fragment of {@code input} that begins with
	 * {@code start} and ends with the nearest following {@code end}.
	 * <p>
	 * The matched fragment includes both delimiters; line breaks are converted
	 * with {@link #br2nl(String)} and all tags (including any in the
	 * delimiters) are then removed with {@link #outTag(String)}.
	 * <p>
	 * Note that {@code start} and {@code end} are inserted into a regular
	 * expression unescaped, so regex metacharacters in them keep their special
	 * meaning. Wrap them in {@link Pattern#quote(String)} for a literal match.
	 *
	 * @param start regex fragment marking the beginning of the wanted section
	 * @param end regex fragment marking the end of the wanted section
	 * @param input the HTML text to search
	 * @return the extracted plain text, or an empty string if nothing matches
	 * @throws Exception declared for backward compatibility with existing callers
	 */
	public String getContent(String start,String end,String input) throws Exception{
		// DOTALL lets the lazy ".*?" run across line breaks in the page.
		Pattern pattern = Pattern.compile(start + ".*?" + end, Pattern.DOTALL);
		Matcher matcher = pattern.matcher(input);
		String retVal = matcher.find() ? matcher.group() : "";
		return outTag(br2nl(retVal));
	}

	/**
	 * Removes everything that looks like a tag ({@code <...>}) from the text,
	 * including tags that span several lines.
	 *
	 * @param s text possibly containing markup
	 * @return {@code s} with all tags removed
	 */
	public String outTag(String s) {
		return TAG.matcher(s).replaceAll("");
	}

	/**
	 * Replaces {@code &nbsp;} entities with a plain space and HTML line breaks
	 * with {@code "\r\n"}.
	 *
	 * @param s text possibly containing {@code &nbsp;} and {@code <br>} variants
	 * @return the converted text
	 */
	public String br2nl(String s){
		String spaced = s.replace("&nbsp;", " ");
		return LINE_BREAK.matcher(spaced).replaceAll("\r\n");
	}

	/**
	 * Demo entry point: downloads a sample page and prints the text of its
	 * main content section.
	 *
	 * @param args unused
	 */
	public static void main(String[] args) throws Exception{
		WebUtils web = new WebUtils();
		// Decode once with the page's charset. The previous approach decoded as
		// UTF-8 and then re-encoded via ISO-8859-1, which replaces every
		// non-Latin-1 character with '?' and cannot recover the text.
		// NOTE(review): GBK is taken from the earlier conversion attempt; confirm
		// against the page's actual encoding.
		String tmp = web.getHtml("http://ts.hjenglish.com/page/74179/", "GBK");
		String c = web.getContent("<div class=\"mainContent_detail_main\">", "\"></script><br>", tmp);
		System.out.println(c);
	}

}
